{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from tqdm import tqdm\n",
    "import os\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Input predictions, output directory and submission template for this run.\n",
    "in_file_path = \"../../results/covid19_rel_lianxiangjia/ensemble/ensemble_res_3.json\"\n",
    "formatted_res_dir = \"../../results/covid19_rel_lianxiangjia/formatted_res\"\n",
    "sample_submission_path = \"../../raw_data/covid19_rel_lianxiangjia/task2_sample_submission.json\"\n",
    "\n",
    "# exist_ok=True creates the directory only when needed, without the\n",
    "# check-then-create race of a separate os.path.exists() test.\n",
    "os.makedirs(formatted_res_dir, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# transform @specific\n",
    "def transform_data(data, sample_submission):\n",
    "    \"\"\"Fill the sample-submission skeleton with de-duplicated relation triplets.\n",
    "\n",
    "    Args:\n",
    "        data: iterable of dicts, each holding an \"id\" and a \"relation_list\"\n",
    "            whose items provide \"subject\", \"predicate\" and \"object\"\n",
    "            (the three values must be hashable, e.g. strings).\n",
    "        sample_submission: list of dicts; the first key of each dict is\n",
    "            used as the sample id.\n",
    "\n",
    "    Returns:\n",
    "        The same ``sample_submission`` list, modified in place: every id is\n",
    "        mapped to its ``[subject, predicate, object]`` triplets in first-seen\n",
    "        order, or to ``[]`` when ``data`` contains no sample with that id.\n",
    "        If an id occurs several times in ``data``, the last occurrence wins.\n",
    "    \"\"\"\n",
    "    idx2rel_list = {}\n",
    "    for sample in tqdm(data, desc = \"Transforming\"):\n",
    "        rel_list = []\n",
    "        seen_triplets = set()\n",
    "        for rel in sample[\"relation_list\"]:\n",
    "            # Key on the tuple itself: a comma-joined string would make\n",
    "            # ('a,b', 'c', 'd') collide with ('a', 'b,c', 'd') and silently\n",
    "            # drop a valid triplet.\n",
    "            triplet = (rel[\"subject\"], rel[\"predicate\"], rel[\"object\"])\n",
    "            if triplet not in seen_triplets:\n",
    "                seen_triplets.add(triplet)\n",
    "                rel_list.append(list(triplet))\n",
    "        idx2rel_list[sample[\"id\"]] = rel_list\n",
    "    for sample in sample_submission:\n",
    "        # Each submission entry is keyed by its sample id (first key).\n",
    "        idx = list(sample.keys())[0]\n",
    "        # NOTE(review): lookup is by exact key equality; assumes \"id\" values in\n",
    "        # data have the same type as the submission keys - TODO confirm.\n",
    "        sample[idx] = idx2rel_list.get(idx, [])\n",
    "    return sample_submission"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Transforming: 100%|██████████| 369/369 [00:00<00:00, 65549.88it/s]\n"
     ]
    }
   ],
   "source": [
    "# Load the predictions (one JSON object per line; blank lines are skipped)\n",
    "# and the submission template, closing both files deterministically.\n",
    "with open(in_file_path, \"r\", encoding = \"utf-8\") as file_in:\n",
    "    ori_data = [json.loads(line) for line in file_in if line.strip()]\n",
    "with open(sample_submission_path, \"r\", encoding = \"utf-8\") as file_in:\n",
    "    sample_submission = json.load(file_in)\n",
    "formatted_data = transform_data(ori_data, sample_submission)\n",
    "\n",
    "# Strip only the final extension so dotted names (e.g. res_0.5.json) are not\n",
    "# truncated at the first dot and mapped to the same output file.\n",
    "in_file_name = os.path.splitext(os.path.basename(in_file_path))[0]\n",
    "out_file_path = os.path.join(formatted_res_dir, \"formatted_{}.json\".format(in_file_name))\n",
    "with open(out_file_path, \"w\", encoding = \"utf-8\") as file_out:\n",
    "    json.dump(formatted_data, file_out, ensure_ascii = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "570"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: number of entries in the formatted submission.\n",
    "len(formatted_data)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
